Using a dataset of user movie reviews provided by https://grouplens.org/datasets/movielens/latest/, we will try (by analyzing the data) to predict the most profitable movie we can produce to make a lot of money.
The following 7 chunks of code are the boring cleaning and preprocessing of the data. But basically what we will do is find the best genre from the rates and views, then take the best movies in this genre and see what tags they have.
First of all, the libraries we will use:
library(reshape2)
library(ggplot2)
library(hrbrthemes)
library(stringr)
library(dplyr)
library(wordcloud2)
options(scipen = 999)
The data is provided as 3 CSV files, which you can learn more about from the dataset page linked above, but here is a brief view of each table:
# Load the three MovieLens tables from the local Data/ directory.
movies <- read.csv("Data/movies.csv")
ratings <- read.csv("Data/ratings.csv")
tags <- read.csv("Data/tags.csv")

# Preview of the movies table: one row per movie, genres are pipe-separated.
head(movies)
## movieId title
## 1 1 Toy Story (1995)
## 2 2 Jumanji (1995)
## 3 3 Grumpier Old Men (1995)
## 4 4 Waiting to Exhale (1995)
## 5 5 Father of the Bride Part II (1995)
## 6 6 Heat (1995)
## genres
## 1 Adventure|Animation|Children|Comedy|Fantasy
## 2 Adventure|Children|Fantasy
## 3 Comedy|Romance
## 4 Comedy|Drama|Romance
## 5 Comedy
## 6 Action|Crime|Thriller

# Preview of the ratings table: one row per (user, movie) rating.
head(ratings)
## userId movieId rating timestamp
## 1 1 1 4 964982703
## 2 1 3 4 964981247
## 3 1 6 4 964982224
## 4 1 47 5 964983815
## 5 1 50 5 964982931
## 6 1 70 3 964982400

# Preview of the tags table: one row per tag a user attached to a movie.
head(tags)
## userId movieId tag timestamp
## 1 2 60756 funny 1445714994
## 2 2 60756 Highly quotable 1445714996
## 3 2 60756 will ferrell 1445714992
## 4 2 89774 Boxing story 1445715207
## 5 2 89774 MMA 1445715200
## 6 2 89774 Tom Hardy 1445715205
Assigning the data columns to the right classes
# Classing data: give each column the type the rest of the analysis expects.

# Timestamps are stored as seconds since 1970; turn them into UTC date-times.
ratings[["timestamp"]] <- as.POSIXct(
  ratings[["timestamp"]],
  origin = "1970-1-1",
  tz = "UTC"
)
tags[["timestamp"]] <- as.POSIXct(
  tags[["timestamp"]],
  origin = "1970-1-1",
  tz = "UTC"
)

# Text-like columns are forced to plain character vectors.
tags[["tag"]] <- as.character(tags[["tag"]])
tags[["movieId"]] <- as.character(tags[["movieId"]])
movies[["title"]] <- as.character(movies[["title"]])
movies[["genres"]] <- as.character(movies[["genres"]])

# Keep a copy of movies as it is now (for future use).
movies2 <- movies
Separating the movie title from the year
# Cleaning data: split titles of the form "Name (YYYY)" into a `title` column
# holding the name and a `year` column holding the four-digit year (as text).
#
# This is done on whole columns at once: the row-by-row version grew the year
# vector with c() on every iteration, used 1:nrow() (which misbehaves on an
# empty table) and shadowed the base function `length`.
title_len <- str_length(movies$title)

# The year is the 4 characters just before the closing parenthesis.
movies$year <- substr(movies$title, title_len - 4, title_len - 1)

# Drop the trailing " (YYYY)" suffix, which is 7 characters long.
movies$title <- substr(movies$title, 1, title_len - 7)

head(movies)
## movieId title
## 1 1 Toy Story
## 2 2 Jumanji
## 3 3 Grumpier Old Men
## 4 4 Waiting to Exhale
## 5 5 Father of the Bride Part II
## 6 6 Heat
## genres year
## 1 Adventure|Animation|Children|Comedy|Fantasy 1995
## 2 Adventure|Children|Fantasy 1995
## 3 Comedy|Romance 1995
## 4 Comedy|Drama|Romance 1995
## 5 Comedy 1995
## 6 Action|Crime|Thriller 1995
Creating a new data frame to contain all the genres, with the total rate and views for each one
# Seed data frame for the genre totals: summed rating (`rate`), number of
# ratings (`views`) and `year`. The single placeholder row only fixes the
# column layout.
genres <- data.frame(genre = "", rate = 0, views = 0, year = 0)
Looping through each rating, we add 1 to the views of every genre of the rated movie and add the rating to the rate of that genre
# Accumulate, for every (genre, year) pair, the number of ratings ("views")
# and the sum of those ratings ("rate"), and append the totals to `genres`.
#
# The row-by-row loop this replaces tested "this year exists in some row" and
# "this genre exists in some row" independently, then updated every row of
# the genre whatever its year. That left a meaningless year column and
# duplicate genre rows that were each incremented (double counting once the
# rows are summed per genre). It also called rbind() inside a loop over every
# rating. The aggregation below keys on genre and year together.

# Genre list of the movie behind each rating (NA when the movie is unknown).
rated_genres <- strsplit(
  as.character(movies$genres[match(ratings$movieId, movies$movieId)]),
  "\\|"
)
genres_per_rating <- lengths(rated_genres)

# Calendar year (UTC) in which each rating was given.
rating_year <- format(
  as.Date(as.POSIXct(ratings$timestamp, tz = "UTC")),
  "%Y"
)

# One row per (rating, genre) combination; each row counts as one view.
rating_genre_rows <- data.frame(
  genre = as.character(unlist(rated_genres)),
  rate = rep(ratings$rating, times = genres_per_rating),
  views = rep(1, times = sum(genres_per_rating)),
  year = rep(rating_year, times = genres_per_rating),
  stringsAsFactors = FALSE
)
rating_genre_rows <- rating_genre_rows[
  !is.na(rating_genre_rows$genre), ,
  drop = FALSE
]

# aggregate() refuses empty input, so only aggregate when there is data.
if (nrow(rating_genre_rows) > 0) {
  genre_year_totals <- aggregate(
    cbind(rate, views) ~ genre + year,
    data = rating_genre_rows,
    FUN = sum
  )
  # Keep the seed row first and the column order of `genres`.
  genres <- rbind(
    genres,
    genre_year_totals[, c("genre", "rate", "views", "year")]
  )
}
The “genres” dataframe after some cleaning
# Erase the placeholder first row and filter out the "(no genres listed)"
# entries. Negative indexing is used because 2:nrow(genres) counts backwards
# (2, 1) when the placeholder is the only row, which would keep it and add an
# all-NA row.
genres <- genres[-1, ]
genres <- genres %>% filter(genre != "(no genres listed)")

# show head of genres
head(genres)
## genre rate views year
## 1 Adventure 84752.5 24156 2000
## 2 Animation 25366.0 6988 2000
## 3 Children 31426.5 9208 2000
## 4 Comedy 132167.5 39047 2000
## 5 Fantasy 41312.5 11834 2000
## 6 Romance 63552.0 18124 2000
Now let’s say that we want to determine our movie’s genre. Maybe we want to know which genre has the best rate, because that means people will love our movie, right? Let’s see all the genres’ rates.
# Sum the rows that share a genre, adding up both the rate and the views.
# genres_rates is built from `genres` here: it was previously read on the
# right-hand side before ever being assigned. The year column is left out
# because it is not something to sum.
genres_rates <- genres[, c("genre", "rate", "views")] %>%
  group_by(genre) %>%
  summarise(rate = sum(rate), views = sum(views)) %>%
  as.data.frame()

# Average rating per genre: sum of rates / number of reviews.
genres_rates$rate <- genres_rates$rate / genres_rates$views

# Bar chart of the average rating, genres ordered by that rating.
ggplot(genres_rates, aes(x = reorder(genre, rate), y = rate, fill = genre)) +
  geom_bar(stat = "identity") +
  labs(x = "Genre", y = "Rate") +
  ggtitle("Comparing with the rate of each genre accorfing to the rate itself") +
  theme_ft_rc() +
  theme(axis.text.x = element_text(angle = 90))
genres-rates-rate
But who wants people to love their movie!! We’re talking money $$$ here. So let’s reorder the genres by the number of views.
# Same average-rating bars as before, but with the genres laid out along the
# x axis in increasing order of views.
ggplot(
  data = genres_rates,
  mapping = aes(x = reorder(genre, views), y = rate, fill = genre)
) +
  geom_bar(stat = "identity") +
  labs(
    x = "Genre",
    y = "Rate",
    title = "Comparing with the rate of each genre accorfing to the views"
  ) +
  theme_ft_rc() +
  # Rotate the genre labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90))
genres-rates-views
Okay okay, let’s calm down for a moment: because the differences in rates are very small, let’s just compare all genres by the number of views.
# Total number of views per genre. genres_views is derived from `genres`
# here because nothing visible in this file assigns it before the plot uses it.
genres_views <- genres[, c("genre", "views")] %>%
  group_by(genre) %>%
  summarise(views = sum(views)) %>%
  as.data.frame()

# plot genre and views, genres ordered by their number of views
ggplot(genres_views, aes(x = reorder(genre, views), y = views, fill = genre)) +
  geom_bar(stat = "identity") +
  labs(x = "Genre", y = "Views") +
  ggtitle("Comparing with the views of each genre") +
  theme_ft_rc() +
  theme(axis.text.x = element_text(angle = 90))
genres-views
Yes we want money, but we also need people to like our movie so maybe they will watch it again and again. I’m saying this because “Comedy” has more views, but “Drama” is very close to it, and “Drama” is also ahead of “Comedy” in rate.
So maybe we need to consider making our movie’s genre “Drama”
Okay, now our movie’s main genre is “Drama”, but what are the main keywords it will include?
Luckily we have a tags table that includes the movie id and the tag, so all we need to do is get all the movies under the “Drama” genre and see what tags they have.
Let’s collect the ids from the movies table.
# Collect the ids of the "Drama" movies: a movie qualifies when "Drama" is
# one of the entries of its pipe-separated genre list (exact match on the
# entry, not a substring search).
#
# Computed on the whole column at once instead of appending to `ids` inside a
# 1:nrow() loop, which copied the vector on every hit and misbehaved on an
# empty table.
is_drama <- vapply(
  strsplit(movies2$genres, "\\|"),
  function(movie_genres) "Drama" %in% movie_genres,
  logical(1)
)
ids <- movies2$movieId[is_drama]
We want the best “Drama” movies by rate, so we will filter the ratings table to:
- take the movie id and movie rating columns
- keep the movies whose id is in our collected ids, so we know they are “Drama”
- group by movie, because there is more than one rating for each movie
- take the mean of each movie’s ratings
- keep only the movies that have a 5-star rate
# Only the movie id and the rating are needed to score the movies.
wanted_cols <- names(ratings) %in% c("movieId", "rating")
drama <- ratings[, wanted_cols]

# Mean rating of every movie whose id was collected as "Drama".
drama <- drama %>%
  filter(movieId %in% ids) %>%
  group_by(movieId) %>%
  summarise_all(mean)
drama <- as.data.frame(drama)

# Best-rated first, then keep the movies whose mean rating is exactly 5.
by_rating_desc <- order(-drama$rating)
drama <- drama[by_rating_desc, ]
has_top_score <- drama$rating == 5
drama <- drama[has_top_score, ]

head(drama)
## movieId rating
## 27 53 5
## 61 148 5
## 222 495 5
## 223 496 5
## 426 1140 5
## 428 1151 5
Now let’s store the chosen movies’ titles to plot them
# Titles of the selected Drama movies, taken from the movies2 copy.
is_chosen <- movies2$movieId %in% drama$movieId
temp <- data.frame(movie = movies2$title[is_chosen])
# Every title gets a random whole-number weight drawn between 1 and 20.
temp$freq <- round(runif(nrow(temp), 1, 20))
And here is a word cloud of some of the chosen movies
# Draw the word cloud from the word/weight table built above.
wordcloud2(data = temp)
movies-wordcloud
Now let’s take the tags they have
# Gather the tags attached to the selected Drama movies.
# Ids are compared as text on both sides.
drama$movieId <- as.character(drama$movieId)

# Tags that say nothing about the content of a movie.
filt <- c("In Netflix queue", "free to download")

tags <- tags %>% filter(movieId %in% drama$movieId & !tag %in% filt)
tags <- tags[order(tags$timestamp), ]

# From here on `tags` is just the vector of tag texts, oldest first.
# The column is taken directly: indexing rows with 1:nrow(tags) produced an
# NA entry when no tag matched.
tags <- tags$tag

temp <- data.frame(word = tags)
head(temp)
## word
## 1 England
## 2 imagination
## 3 social commentary
## 4 creativity
## 5 dystopia
## 6 claymation
and plot them
# Weight every tag by the number of times it was actually applied, one row
# per distinct tag. Random weights made the cloud change on every run and
# unrelated to how often a tag is used, and drew repeated tags as separate
# words.
tag_counts <- table(temp$word)
temp <- data.frame(
  word = as.character(names(tag_counts)),
  freq = as.vector(tag_counts),
  stringsAsFactors = FALSE
)
# Most frequent tags first.
temp <- temp[order(-temp$freq), ]
wordcloud2(temp)
tags-wordcloud
So we finally have our movie: it’ll be an atmospheric, no-dialogue, dystopian, harsh, disturbing, bleak and gritty movie involving a story of creativity and imagination that happens in or involves “England”.
Hmmm, not quite what we expected… but I guess the internet has a very weird taste.
We made it! We have our movie, we will be millionaires… The only problem is that we need money first to produce the movie :D